
***** Is Europe an optimal Political Area?                *****
***** Alberto Alesina, Guido Tabellini & Francesco Trebbi *****

***** Section V.b

* Set up: start from an empty session, set session limits, pin the Stata version
clear all
set mem 10g
set matsize 11000
set more off
version 13

use "data/EVS_GSS_final.dta", clear

* Output folders (capture: carry on if the folder cannot be created, e.g. it already exists)
capture mkdir "section5"
capture mkdir "appendix"

** US state
* Keep US respondents from the nine states analysed below
keep if country=="US"
decode us_state, gen(state)
keep if inlist(state, "California", "New York", "Texas", "North Carolina", "Ohio") ///
	| inlist(state, "Michigan", "Illinois", "Florida", "Pennsylvania")
* Two-word state names get an underscore so they can sit in space-separated lists
replace state = subinstr(state, " ", "_", .) if inlist(state, "New York", "North Carolina")
* From here on the state name is the area identifier
replace area2=state

** Tab some covariates to use
* One indicator variable per category, named X_<variable><k>
foreach v in marital_status education employment income_recoded size_of_town {
	quietly tabulate `v', generate(X_`v')
}
* education1 uses its own stub (X_education_v)
quietly tabulate education1, generate(X_education_v)

** Setting waves
* Survey years are grouped into four waves; any other year is discarded
generate wave = .
replace wave = 1 if inlist(year, 1981, 1982, 1983, 1984, 1986)
replace wave = 2 if inlist(year, 1990, 1991, 1993)
replace wave = 3 if inlist(year, 1998, 1999, 2000)
replace wave = 4 if inlist(year, 2006, 2008, 2009, 2010)
drop if missing(wave)


** Cultural Variables
* Reduced list
global cult Y_obedience Y_trust Y_ideology Y_religious Y_divorce Y_euthan Y_suicide Y_altruism Y_hardwork
* Extended list1 = reduced list + four further items
global ecult1 $cult Y_redistrib Y_workingfemale Y_careerfemale Y_preschoolmother
* Extended list2 = extended list1 + two further items
global ecult2 $ecult1 Y_private Y_equalize

** Covariates
* Part shared by all three covariate lists
local xcore X_age X_sex X_marital_status* X_employment1 X_employment2 X_employment3 X_employment4 X_employment5 X_employment6 X_income_recoded*
* Reduced list
global cov `xcore' X_education_v*
* Extended list1 (adds the size-of-town dummies)
global ecov1 `xcore' X_size_of_town* X_education_v*
* Extended list2 (as list1, but with the X_education* wildcard)
global ecov2 `xcore' X_size_of_town* X_education*

* "country" now holds the area identifier stored in area2
drop country
generate country = area2

********************************************************************************
program gaussian_kernel
	* Pairwise Gaussian-kernel distance between all observations in memory.
	*
	*   gaussian_kernel varlist, idvar(varname) genfile(string) ///
	*                            name1(name) name2(name) scorevar(name)
	*
	*   varlist    variables spanning the space in which distance is measured
	*   idvar      observation identifier
	*   genfile    file that receives the pairwise data (overwritten)
	*   name1      name given to the identifier of the first member of a pair
	*   name2      name given to the identifier of the second member of a pair
	*   scorevar   name of the distance variable: 1 - exp(-||x_i - x_j||^2 / (2*D)),
	*              0 = same vectors, approaching 1 = maximally distant vectors
	*
	* All options are needed although the syntax diagram declares them optional.
	* Missing values contribute zero to the squared distance.
	* Side effects: writes rtmp.dta (copy of the caller's data) and tmp0.dta in the
	* working directory; the caller's data are reloaded from rtmp.dta on exit.
	#delimit ;
	syntax varlist ,
	[
	idvar(varname)
	
	genfile(string) name1(name) name2(name) scorevar(name)
	]
	;
	#delimit cr
	* Fail early, with a readable message, when a needed option was left out
	foreach opt in idvar genfile name1 name2 scorevar {
		if "``opt''" == "" {
			display as error "gaussian_kernel: option `opt'() is required"
			exit 198
		}
	}
	* Work on each variable once, and never treat the identifier as a dimension
	local vars : list uniq varlist
	local vars : list vars - idvar
	*
	save rtmp.dta, replace
	keep `idvar' `vars'
	/*Set kernel bandwidth equal to the number of dimensions following 
	Hainmuller and Hazlett KRLS Package:
	"By default, the bandwidth is set equal to D (the number of dimensions) which 
	typically yields a reasonable scaling of the distances between observations 
	in the standardized data that is used for the fitting."
	*/
	tempname sigmasq alpha
	qui describe
	* r(k) counts the kept variables; minus one for the identifier
	scalar `sigmasq' = r(k) - 1
	scalar `alpha' = 1/(2*`sigmasq')
	* Move to positional internal names so that user names such as x1 and x12
	* can never collide with the names used for the partner copy
	local k = 0
	foreach var of local vars {
		local ++k
		rename `var' __gk_a`k'
	}
	rename `idvar' __gk_ida
	preserve
		forvalues j = 1/`k' {
			rename __gk_a`j' __gk_b`j'
		}
		rename __gk_ida __gk_idb
		save tmp0.dta, replace
	restore
	* Every observation paired with every observation (including itself)
	cross using tmp0.dta
	* Squared Euclidean distance, accumulated one dimension at a time
	gen double __gk_ss = 0
	forvalues j = 1/`k' {
		replace __gk_ss = __gk_ss + (__gk_a`j' - __gk_b`j')^2 if !missing(__gk_a`j', __gk_b`j')
	}
	*get 1-gaussian kernel measure of distance 0= same vectors, 1= maximally distant vectors.
	gen double `scorevar' = 1 - exp(-`alpha'*__gk_ss)
	rename __gk_ida `name1'
	rename __gk_idb `name2'
	keep `name1' `name2' `scorevar' 
	save `genfile', replace
	use rtmp.dta, clear
end

program cosine
	* Pairwise angular (cosine) distance between all observations in memory.
	*
	*   cosine varlist, idvar(varname) genfile(string) ///
	*                   name1(name) name2(name) scorevar(name)
	*
	*   varlist    variables spanning the space in which the angle is measured
	*   idvar      observation identifier
	*   genfile    file that receives the pairwise data (overwritten)
	*   name1      name given to the identifier of the first member of a pair
	*   name2      name given to the identifier of the second member of a pair
	*   scorevar   name of the distance variable: acos(cosine similarity)/pi,
	*              0 = same direction, 1 = opposite direction
	*
	* All options are needed although the syntax diagram declares them optional.
	* Missing values contribute zero to the dot product and to the norms.
	* A vector of zero length has no direction: its distances are missing.
	* Side effects: writes rtmp.dta (copy of the caller's data) and tmp0.dta in the
	* working directory; the caller's data are reloaded from rtmp.dta on exit.
	#delimit ;
	syntax varlist ,
	[
	idvar(varname)
	
	genfile(string) name1(name) name2(name) scorevar(name)
	]
	;
	#delimit cr
	* Fail early, with a readable message, when a needed option was left out
	foreach opt in idvar genfile name1 name2 scorevar {
		if "``opt''" == "" {
			display as error "cosine: option `opt'() is required"
			exit 198
		}
	}
	* Work on each variable once, and never treat the identifier as a dimension
	local vars : list uniq varlist
	local vars : list vars - idvar
	*
	save rtmp.dta, replace
	keep `idvar' `vars'
	* Move to positional internal names so that user names such as x1 and x12
	* can never collide with the names used for the partner copy
	local k = 0
	foreach var of local vars {
		local ++k
		rename `var' __cs_a`k'
	}
	rename `idvar' __cs_ida
	preserve
		forvalues j = 1/`k' {
			rename __cs_a`j' __cs_b`j'
		}
		rename __cs_ida __cs_idb
		save tmp0.dta, replace
	restore
	* Every observation paired with every observation (including itself)
	cross using tmp0.dta
	* Dot product and squared norms, accumulated one dimension at a time
	gen double __cs_dot = 0
	gen double __cs_na  = 0
	gen double __cs_nb  = 0
	forvalues j = 1/`k' {
		replace __cs_dot = __cs_dot + __cs_a`j'*__cs_b`j' if !missing(__cs_a`j', __cs_b`j')
		replace __cs_na  = __cs_na + __cs_a`j'^2 if !missing(__cs_a`j')
		replace __cs_nb  = __cs_nb + __cs_b`j'^2 if !missing(__cs_b`j')
	}
	*Cosine
	gen double __cs_cos = __cs_dot/(sqrt(__cs_na)*sqrt(__cs_nb))
	*get cosine measure of distance 0= same vectors, 1= maximally distant vectors.
	* Rounding can leave the ratio a hair outside [-1,1], where acos() is missing;
	* clamp it first. max()/min() skip missing arguments, so missing ratios are
	* handled explicitly and stay missing.
	gen double `scorevar' = cond(missing(__cs_cos), ., acos(max(-1, min(1, __cs_cos)))/_pi)
	rename __cs_ida `name1'
	rename __cs_idb `name2'
	keep `name1' `name2' `scorevar' 
	save `genfile', replace
	use rtmp.dta, clear
end
********************************************************************************

* Number of individuals drawn per state and wave, and the seed for the draw
global N = 60
set seed 364011739

* Complete cases only: discard anyone with a missing basic covariate or cultural item
egen n_missing__ = rowmiss($cov $cult)
drop if n_missing__ > 0
drop n_missing__

* Random draw of at most $N individuals within each state-wave cell
generate random = runiform()
bysort country wave (random): keep if _n <= $N

* Standardise every basic covariate and cultural item on the retained sample
foreach v of varlist $cov $cult {
	quietly summarize `v'
	replace `v' = (`v' - r(mean)) / r(sd)
}

** Some utility files for below
* Two copies of the person-level identifiers and coordinates: one keyed on ROW
* with *_row names (tmp1r.dta), one keyed on COL with *_col names (tmp1c.dta)
foreach side in row col {
	local key = upper("`side'")
	local tag = substr("`side'", 1, 1)
	preserve
	keep persno country wave latitude longitude CapitalLatitude CapitalLongitude
	rename persno `key'
	foreach v in country latitude longitude CapitalLatitude CapitalLongitude {
		rename `v' `v'_`side'
	}
	sort `key'
	save "tmp1`tag'.dta", replace
	restore
}

* Residual of each cultural item after regressing it on the basic covariates
* (global i ends up at one plus the number of items)
global i = 1
foreach outcome of varlist $cult {
	regress `outcome' $cov
	predict res_`outcome', residuals
	global i = $i + 1
}

** Generate distance between randomly sampled individuals (paired within each wave and all countries)
* For each wave this writes one file of pairwise distances per variable set:
*   tmp_x<w>     basic covariates             (cov_mdist)
*   tmp_eX<w>    extended covariates, w >= 2  (ecov_mdist)
*   tmp_y<w>     basic culture                (cult_mdist)
*   tmp_eY<w>    extended culture             (ecult_mdist)
*   tmp_resy<w>  residualised basic culture   (res_cult_mdist)
forvalues i=1(1)4 {
	preserve
		keep if wave==`i'
		global wave = `i'
		* Variables with no variance within the wave are left out of the distance
		* (and hence out of the bandwidth). "of varlist" expands the wildcards in
		* $cov so that every dummy is checked on its own; with "foreach ... in" a
		* token such as X_marital_status* is summarised as a group and r(sd) only
		* describes its last member.
		local novar
		foreach var of varlist $cov $cult {
			qui sum `var'
			if r(sd) == 0 {
				local novar `novar' `var'
			}
		}
		* Expand each list to plain variable names and remove the constant ones,
		* rather than dropping them from the data, so that the calls below never
		* refer to a variable that is no longer there
		unab full_cov    : $cov
		unab full_ecov1  : $ecov1
		unab full_cult   : $cult
		unab full_ecult1 : $ecult1
		foreach set in cov ecov1 cult ecult1 {
			local use_`set' : list full_`set' - novar
		}
		*
		*x basic socio-economic covariates set
		qui gaussian_kernel `use_cov', idvar(persno) genfile(tmp_x`i') scorevar(cov_mdist) name1(ROW) name2(COL)
		*X extended socio-economic covariates set, only for waves 2 to 4
		if `i' >= 2 {
			qui gaussian_kernel `use_ecov1', idvar(persno) genfile(tmp_eX`i') scorevar(ecov_mdist) name1(ROW) name2(COL)
			*qui cosine $ecov1, idvar(persno) genfile(tmp_eX`i') scorevar(ecov_mdist) name1(ROW) name2(COL)
		}
		*y basic culture
		qui gaussian_kernel `use_cult', idvar(persno) genfile(tmp_y`i') scorevar(cult_mdist) name1(ROW) name2(COL)
		*Y extended culture
		qui gaussian_kernel `use_ecult1', idvar(persno) genfile(tmp_eY`i') scorevar(ecult_mdist) name1(ROW) name2(COL)
		*qui cosine $ecult1, idvar(persno) genfile(tmp_eY`i') scorevar(ecult_mdist) name1(ROW) name2(COL)
		
		** residuals of y basic culture on covariates 
		qui gaussian_kernel res_*, idvar(persno) genfile(tmp_resy`i') scorevar(res_cult_mdist) name1(ROW) name2(COL)
		*
		*
	restore
}
*
**Change the matrices of distances into data
* Stamp every per-wave distance file with its wave number. The extended
* covariate file (eX) is only handled for waves 2 to 4.
forvalues w = 1/4 {
	global wave = `w'
	local stems x y eY resy
	if `w' > 1 {
		local stems x eX y eY resy
	}
	foreach s of local stems {
		use "tmp_`s'`w'.dta", clear
		generate wave = `w'
		save "tmp_`s'`w'.dta", replace
	}
}
*
* Stack the per-wave files of each distance measure into one file sorted by pair.
* The extended covariate measure (eX) is stacked from wave 3 onwards only.
* NOTE(review): tmp_eX2.dta exists at this point but is not stacked here, so
* ecov_mdist ends up missing for wave 2 -- confirm that this is intended.
foreach s in x eX y eY resy {
	local first = cond("`s'" == "eX", 3, 1)
	local next = `first' + 1
	use "tmp_`s'`first'.dta", clear
	forvalues w = `next'/4 {
		append using "tmp_`s'`w'.dta"
	}
	sort ROW COL
	save "tmp_`s'.dta", replace
}
*
* Bring the five distance measures together, one row per (ROW, COL) pair.
* The data are re-sorted on the pair before every merge after the first; the
* match result is tabulated each time and then discarded.
use "tmp_x.dta", clear
foreach s in eX y eY resy {
	if "`s'" != "eX" {
		sort ROW COL
	}
	merge ROW COL using "tmp_`s'.dta"
	tabulate _merge
	drop _merge
}
*
label variable cov_mdist "Dist. in Covariates"
label variable ecov_mdist "Dist. in Ext. Covariates"
label variable cult_mdist "Dist. in Culture"
label variable ecult_mdist "Dist. in Ext. Culture"
label variable res_cult_mdist "Dist. in Cultural residuals"

*Identifying countries
* Attach the area, wave and coordinates of both members of every pair;
* rows that exist only in the lookup files are discarded
sort ROW
merge ROW using "tmp1r.dta"
keep if _m~=2
drop _m
sort COL
merge COL using "tmp1c.dta"
keep if _m~=2
drop _m
* same = 1 when both members of the pair come from the same area
gen same = (country_row==country_col)
*Keeping unique correlations
* drop diagonal 
drop if ROW==COL
*keep only lower triangular
* id1 ranks the pair by (ROW, COL), id2 by (COL, ROW). A pair is kept when its
* (ROW, COL) rank is not larger than its (COL, ROW) rank. This is the same rule
* as row-sorting (id1, id2) and keeping the rows that were already in order,
* written without the user-written rowsort command, which this script never
* installs. Ranks are stored as long so that they stay exact.
isid ROW COL
sort ROW COL
gen long id1 = _n
sort COL ROW
gen long id2 = _n 
keep if id1 <= id2
drop id1 id2
replace country_row = "GB" if country_row == "GB-GBN"
replace country_col = "GB" if country_col == "GB-GBN"

* geodist is user-written: fetch it from SSC only when it is not yet installed,
* so that the script also runs without a network connection once it is
capture which geodist
if _rc {
	ssc install geodist
}
geodist latitude_row longitude_row latitude_col  longitude_col, gen(region_dist)
geodist CapitalLatitude_row CapitalLongitude_row CapitalLatitude_col  CapitalLongitude_col, gen(country_dist)
label var region_dist "Dist. between regions"
label var country_dist "Dist. between countries"

* State list and column labels
global country5 "California Texas Illinois Michigan Florida Ohio North_Carolina Pennsylvania New_York"
global lc "Unconditional Conditional"

* Sample flags tmp1-tmp6: for waves 1, 3 and 4 in turn, first the
* same-state pairs (odd number), then the different-state pairs (even number)
local k = 0
foreach w in 1 3 4 {
	foreach s in 1 0 {
		local ++k
		generate tmp`k' = (wave==`w' & same==`s')
	}
}

save "data_to_use.dta", replace


* Figure A.14
* Panel a: pairs from different states (flags tmp2 = wave 1, tmp6 = wave 4).
* Panel b: pairs from the same state   (flags tmp1 = wave 1, tmp5 = wave 4).
* Each panel puts the wave-1 (dashed) and wave-4 (solid) densities of the
* cultural distance next to those of the distance in cultural residuals.
use "data_to_use.dta", clear

foreach panel in between within {
	if "`panel'" == "between" {
		local early tmp2
		local late  tmp6
		local who   "Diff. states"
		local sfx   ""
		local tag   "a"
	}
	else {
		local early tmp1
		local late  tmp5
		local who   "Same State"
		local sfx   "_same"
		local tag   "b"
	}

	kdensity cult_mdist if `early', addplot(kdensity cult_mdist if `late', lw(medthick) lp(solid)) ///
		legend(lab(1 "`who' W1") label(2 "`who' W4")) lw(medthick) lp(dash) title("Cultural distance")
	graph save "fig7_all_states`sfx'", replace

	kdensity res_cult_mdist if `early', addplot(kdensity res_cult_mdist if `late', lw(medthick) lp(solid)) ///
		legend(lab(1 "`who' W1") label(2 "`who' W4")) lw(medthick) lp(dash) title("Distance in cultural residuals")
	graph save "fig11_all_states`sfx'", replace

	graph combine fig7_all_states`sfx'.gph fig11_all_states`sfx'.gph, cols(2) title("all 9 US states, `panel'")
	qui graph export "appendix/figure_A14`tag'.png", width(4000) height(2300) replace
}

* Remove the sample flags and the intermediate graph files
drop tmp*
foreach g in fig7_all_states fig11_all_states fig7_all_states_same fig11_all_states_same {
	erase `g'.gph
}
***
***


* Table A.8
* Change in cultural distance between wave 1 and wave 4, for each state and
* for all states together. Matrix A holds one five-row block per state
* (coefficient on wave4, its standard error, the p-value of the test that it
* is zero, the constant, one empty row) and a final block for the aggregate.
* Columns: pairs from different states without / with the covariate-distance
* controls, then pairs from the same state without / with those controls.
use "data_to_use.dta", clear
global country3 "California Texas Illinois Michigan Florida Ohio North_Carolina Pennsylvania New_York"
global country4 "California Texas Illinois Michigan Florida Ohio North_Carolina Pennsylvania New_York"
global country5 "California Texas Illinois Michigan Florida Ohio North_Carolina Pennsylvania New_York"
global lc "Unconditional Conditional"

drop if wave==2 | wave==3 

* Generate dummy for wave 4 and interaction with covariates
g wave4 = (wave==4)
g wave4_cov = wave4 * cov_mdist

* a8_fit: run one regression and leave its results in four scalars
*   <prefix>1 = coefficient on wave4     <prefix>2 = its standard error
*   <prefix>3 = p-value of "test wave4"  <prefix>4 = constant
* The scalars are reset to missing first, so a regression that cannot be
* estimated shows up as missing instead of silently repeating the numbers of
* the previous state.
capture program drop a8_fit
program a8_fit
	syntax varlist(min=2) [if], prefix(name)
	forvalues k = 1/4 {
		scalar `prefix'`k' = .
	}
	capture regress `varlist' `if'
	if _rc {
		exit
	}
	scalar `prefix'1 = _b[wave4]
	scalar `prefix'2 = _se[wave4]
	scalar `prefix'4 = _b[_cons]
	capture test wave4
	if !_rc {
		scalar `prefix'3 = r(p)
	}
end

* a8_fill: run the four specifications on the pairs selected by [if] and store
* them in columns 1-4 of matrix A, starting at row row()
capture program drop a8_fill
program a8_fill
	syntax [if], row(integer)
	tempvar touse
	mark `touse' `if'
	a8_fit cult_mdist wave4                     if `touse' & same==0, prefix(a)
	a8_fit cult_mdist cov_mdist wave4 wave4_cov if `touse' & same==0, prefix(b)
	a8_fit cult_mdist wave4                     if `touse' & same==1, prefix(c)
	a8_fit cult_mdist cov_mdist wave4 wave4_cov if `touse' & same==1, prefix(d)
	local col = 0
	foreach p in a b c d {
		local ++col
		forvalues k = 1/4 {
			matrix A[`row'+`k'-1,`col'] = `p'`k'
		}
	}
end

local i = 1
mat A = J(50,4,.)
foreach countryi of global country5 {
	* pairs with at least one member from this state
	a8_fill if country_row=="`countryi'" | country_col=="`countryi'", row(`i')
	di "`countryi'"
	sca list a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4
	local i = `i'+ 5
}
* Aggregate
a8_fill, row(`i')
*
* One name per row (9 state blocks plus the aggregate block = 50 rows) and one
* name per column, so that every row and column of the table is identified
global country5bis California . . avg_W1 . Texas . . avg_W1 . Illinois . . avg_W1 . Michigan . . avg_W1 . Florida . . avg_W1 . Ohio . . avg_W1 . North_Carolina . . avg_W1 . Pennsylvania . . avg_W1 . New_York . . avg_W1 . All_states . . avg_W1 .
mat rown A = $country5bis
mat coln A = Diff_Uncond Diff_Cond Same_Uncond Same_Cond
mata: A = st_matrix("A")

** Output graphs & tables
* Write Stata matrix A, with its row and column names, to appendix/table_A8.docx
mata:
	// new document, default font and size
	dh = _docx_new()
	_docx_set_font(dh, "Garamond")
	_docx_set_size(dh, 15)

	// caption paragraph
	e = _docx_paragraph_new(dh,"")
	e = _docx_paragraph_add_text(dh, "Table: Avg. cultural distance between each states citizen & US citizens from other states. Unconditional and conditional on socio-economic covariates.")
	e = _docx_set_size(dh, 30)
	e = _docx_text_set_bold(dh, 1)

	// the table itself, followed by a page break
	e = _docx_paragraph_new(dh,"")
	e = _docx_set_size(dh, 15)
	e = _docx_add_matrix(dh, "A", "%10.6f", 1, 1)
	e = _docx_add_pagebreak(dh)

	// save (third argument 1, as in the call this replaces) and release the document
	_docx_save(dh, "appendix/table_A8.docx",1)
	_docx_close(dh)
end

** Housekeeping
* Best-effort removal of the intermediate files. Several names in the list are
* not written anywhere in this script, and erase stops with an error on a file
* that does not exist, so each erase is captured to let the clean-up run to
* the end.
foreach file in tmp1c.dta tmp1r.dta tmp_x1.dta tmp_x2.dta tmp_x3.dta tmp_x4.dta ///
	tmp_y1.dta tmp_y2.dta tmp_y3.dta tmp_y4.dta tmp_eY1.dta tmp_eY2.dta 		///
	tmp_eY3.dta tmp_eY4.dta tmp_eX3.dta tmp_eX4.dta rtmp.dta tmp_eX2.dta 		///
	tmp_resy1.dta tmp_resy2.dta tmp_resy3.dta tmp_resy4.dta tmp0.dta     		///
	tmp_eY.dta  tmp_y.dta  tmp_eX.dta  tmp_x.dta  tmp_resy.dta  				///
	tmp_resyi.dta tmp_resyi4.dta tmp_resyt.dta tmp_resyt4.dta tmp_xi.dta 		///
	tmp_xi4.dta tmp_xt.dta tmp_xt4.dta data_to_use.dta {
	capture erase `file'
}


